Predicting whether the stock market moves Up or Down
A simple exercise to understand how to run logistic regression on stock market data using the tidymodels framework.
This dataset consists of percentage returns for the S&P 500 stock index over 1,250 days, from the beginning of 2001 until the end of 2005.
The train/test split is based on time (training on 2001-2004, testing on 2005), which is slightly different from the usual initial_split() method.
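The code below assumes a setup chunk (not shown in this excerpt) that loads the following packages; the list is inferred from the function calls used throughout the post.
# Packages assumed by the code in this post (setup chunk not shown)
library(tidyverse)   # dplyr, ggplot2, purrr, stringr
library(tidymodels)  # recipes, parsnip, workflows, yardstick, broom
library(ISLR2)       # Smarket data
library(janitor)     # clean_names()
library(skimr)       # skim()
library(GGally)      # ggpairs()
library(ggsci)       # scale_fill_jco(), scale_color_jco()
library(patchwork)   # wrap_plots(), plot composition with +
library(discrim)     # engines for discrim_linear(), discrim_quad()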
stock_market <- Smarket %>%
clean_names()
head(stock_market)
year lag1 lag2 lag3 lag4 lag5 volume today direction
1 2001 0.381 -0.192 -2.624 -1.055 5.010 1.1913 0.959 Up
2 2001 0.959 0.381 -0.192 -2.624 -1.055 1.2965 1.032 Up
3 2001 1.032 0.959 0.381 -0.192 -2.624 1.4112 -0.623 Down
4 2001 -0.623 1.032 0.959 0.381 -0.192 1.2760 0.614 Up
5 2001 0.614 -0.623 1.032 0.959 0.381 1.2057 0.213 Up
6 2001 0.213 0.614 -0.623 1.032 0.959 1.3491 1.392 Up
glimpse(stock_market) # 1250 x 9
Rows: 1,250
Columns: 9
$ year <dbl> 2001, 2001, 2001, 2001, 2001, 2001, 2001, 2001, 20…
$ lag1 <dbl> 0.381, 0.959, 1.032, -0.623, 0.614, 0.213, 1.392, …
$ lag2 <dbl> -0.192, 0.381, 0.959, 1.032, -0.623, 0.614, 0.213,…
$ lag3 <dbl> -2.624, -0.192, 0.381, 0.959, 1.032, -0.623, 0.614…
$ lag4 <dbl> -1.055, -2.624, -0.192, 0.381, 0.959, 1.032, -0.62…
$ lag5 <dbl> 5.010, -1.055, -2.624, -0.192, 0.381, 0.959, 1.032…
$ volume <dbl> 1.1913, 1.2965, 1.4112, 1.2760, 1.2057, 1.3491, 1.…
$ today <dbl> 0.959, 1.032, -0.623, 0.614, 0.213, 1.392, -0.403,…
$ direction <fct> Up, Up, Down, Up, Up, Up, Down, Up, Up, Up, Down, …
summary(stock_market)
year lag1 lag2
Min. :2001 Min. :-4.922000 Min. :-4.922000
1st Qu.:2002 1st Qu.:-0.639500 1st Qu.:-0.639500
Median :2003 Median : 0.039000 Median : 0.039000
Mean :2003 Mean : 0.003834 Mean : 0.003919
3rd Qu.:2004 3rd Qu.: 0.596750 3rd Qu.: 0.596750
Max. :2005 Max. : 5.733000 Max. : 5.733000
lag3 lag4 lag5
Min. :-4.922000 Min. :-4.922000 Min. :-4.92200
1st Qu.:-0.640000 1st Qu.:-0.640000 1st Qu.:-0.64000
Median : 0.038500 Median : 0.038500 Median : 0.03850
Mean : 0.001716 Mean : 0.001636 Mean : 0.00561
3rd Qu.: 0.596750 3rd Qu.: 0.596750 3rd Qu.: 0.59700
Max. : 5.733000 Max. : 5.733000 Max. : 5.73300
volume today direction
Min. :0.3561 Min. :-4.922000 Down:602
1st Qu.:1.2574 1st Qu.:-0.639500 Up :648
Median :1.4229 Median : 0.038500
Mean :1.4783 Mean : 0.003138
3rd Qu.:1.6417 3rd Qu.: 0.596750
Max. :3.1525 Max. : 5.733000
stock_market %>%
skim()
Name | Piped data |
Number of rows | 1250 |
Number of columns | 9 |
_______________________ | |
Column type frequency: | |
factor | 1 |
numeric | 8 |
________________________ | |
Group variables | None |
Variable type: factor
skim_variable | n_missing | complete_rate | ordered | n_unique | top_counts |
---|---|---|---|---|---|
direction | 0 | 1 | FALSE | 2 | Up: 648, Dow: 602 |
Variable type: numeric
skim_variable | n_missing | complete_rate | mean | sd | p0 | p25 | p50 | p75 | p100 | hist |
---|---|---|---|---|---|---|---|---|---|---|
year | 0 | 1 | 2003.02 | 1.41 | 2001.00 | 2002.00 | 2003.00 | 2004.00 | 2005.00 | ▇▇▇▇▇ |
lag1 | 0 | 1 | 0.00 | 1.14 | -4.92 | -0.64 | 0.04 | 0.60 | 5.73 | ▁▃▇▁▁ |
lag2 | 0 | 1 | 0.00 | 1.14 | -4.92 | -0.64 | 0.04 | 0.60 | 5.73 | ▁▃▇▁▁ |
lag3 | 0 | 1 | 0.00 | 1.14 | -4.92 | -0.64 | 0.04 | 0.60 | 5.73 | ▁▃▇▁▁ |
lag4 | 0 | 1 | 0.00 | 1.14 | -4.92 | -0.64 | 0.04 | 0.60 | 5.73 | ▁▃▇▁▁ |
lag5 | 0 | 1 | 0.01 | 1.15 | -4.92 | -0.64 | 0.04 | 0.60 | 5.73 | ▁▃▇▁▁ |
volume | 0 | 1 | 1.48 | 0.36 | 0.36 | 1.26 | 1.42 | 1.64 | 3.15 | ▁▇▅▁▁ |
today | 0 | 1 | 0.00 | 1.14 | -4.92 | -0.64 | 0.04 | 0.60 | 5.73 | ▁▃▇▁▁ |
stock_market %>%
ggpairs()
stock_market %>%
ggstatsplot::ggcorrmat()
stock_market %>%
ggplot(aes(direction)) +
geom_bar(aes(fill = direction), show.legend = FALSE) +
geom_text(stat = "count", aes(label = after_stat(count)), vjust = -1) +
scale_fill_jco() +
theme_classic() +
labs(title = "Direction") +
scale_y_continuous(expand = expansion(mult = c(0, 0.25))) +
theme(axis.title = element_text(face = "bold"),
axis.text.x = element_text(face = "bold"))
Going the extra mile to create a function:
plot_fct_bar <- function(var_x) {
stock_market %>%
mutate(var_x_fct = factor({{var_x}})) %>% # so that other classes can be used for the plot as well
ggplot(aes(var_x_fct)) +
geom_bar(aes(fill = var_x_fct), show.legend = FALSE) +
geom_text(stat = "count", aes(label = after_stat(count)), vjust = -1) +
scale_fill_jco() + # from ggsci package
scale_y_continuous(expand = expansion(mult = c(0, 0.2))) +
theme_classic() +
labs(title = str_to_title(as_label(enquo(var_x))),
x = as_label(enquo(var_x))) +
theme(axis.title = element_text(face = "bold"),
axis.text.x = element_text(face = "bold"))
}
plot_direction <- plot_fct_bar(direction)
plot_year <- plot_fct_bar(year)
# using patchwork
plot_direction + plot_year
# iterating across variables
fct_var <- stock_market %>%
mutate(year = factor(year)) %>%
select_if(is.factor) %>%
names()
# let the iterations begin
fct_var %>%
syms() %>% # take strings as input and turn them into symbols
map(function(var) plot_fct_bar(!!var)) %>% # inject each symbol into the plotting function
wrap_plots() # from patchwork
A simple plot:
stock_market %>%
ggplot(aes(lag1)) +
geom_freqpoly(size = 1) +
theme_classic() +
labs(title = "lag1") +
theme(axis.title = element_text(face = "bold"),
axis.text.x = element_text(face = "bold"))
A function:
plot_many_freqpoly <- function(var_x) {
stock_market %>%
ggplot(aes({{var_x}})) +
geom_freqpoly(size = 1) +
theme_classic() +
labs(title = str_to_title(as_label(enquo(var_x))),
x = as_label(enquo(var_x))) +
theme(axis.title = element_text(face = "bold"),
axis.text.x = element_text(face = "bold"))
}
plot_many_freqpoly(lag3)
# set names to loop through
num_var <- stock_market %>%
select(-year) %>%
select_if(is.numeric) %>%
names()
# let the iterations begin
num_var %>%
syms() %>% # take strings as input and turn them into symbols
map(function(var) plot_many_freqpoly(!!var)) %>% # inject each symbol into the plotting function
wrap_plots() # from patchwork
Although these distributions were already visible in the skim() output, building my own functions lets me customise their appearance.
To use 2005 data for testing.
To use data prior to 2005 (2001-2004) for training.
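The chunk that creates the split is not shown in this excerpt; a minimal sketch of the time-based split described above (row counts are inferred from the outputs below) could look like this:
# Time-based split: train on 2001-2004, test on 2005
# (not the usual random initial_split())
train_data <- stock_market %>% filter(year < 2005)   # 998 rows
test_data  <- stock_market %>% filter(year == 2005)  # 252 rows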
lr_model <-
logistic_reg() %>%
set_engine("glm")
stock_recipe <-
recipe(direction ~ ., data = train_data) %>%
update_role(year, today, new_role = "ID")
summary(stock_recipe)
# A tibble: 9 × 4
variable type role source
<chr> <chr> <chr> <chr>
1 year numeric ID original
2 lag1 numeric predictor original
3 lag2 numeric predictor original
4 lag3 numeric predictor original
5 lag4 numeric predictor original
6 lag5 numeric predictor original
7 volume numeric predictor original
8 today numeric ID original
9 direction nominal outcome original
stock_workflow <-
workflow() %>%
add_model(lr_model) %>%
add_recipe(stock_recipe)
stock_workflow
══ Workflow ══════════════════════════════════════════════════════════
Preprocessor: Recipe
Model: logistic_reg()
── Preprocessor ──────────────────────────────────────────────────────
0 Recipe Steps
── Model ─────────────────────────────────────────────────────────────
Logistic Regression Model Specification (classification)
Computational engine: glm
stock_fit <-
stock_workflow %>%
fit(data = train_data)
stock_fit %>%
extract_fit_parsnip() %>%
tidy()
# A tibble: 7 × 5
term estimate std.error statistic p.value
<chr> <dbl> <dbl> <dbl> <dbl>
1 (Intercept) 0.191 0.334 0.573 0.567
2 lag1 -0.0542 0.0518 -1.05 0.295
3 lag2 -0.0458 0.0518 -0.884 0.377
4 lag3 0.00720 0.0516 0.139 0.889
5 lag4 0.00644 0.0517 0.125 0.901
6 lag5 -0.00422 0.0511 -0.0826 0.934
7 volume -0.116 0.240 -0.485 0.628
The smallest p-value is associated with lag1, and its negative coefficient suggests that if the market had a positive return yesterday, it is less likely to go up today.
However, at roughly 0.3 the p-value is still relatively large, so there is no clear evidence of a real association between lag1 and direction.
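If odds ratios are easier to reason about than log-odds coefficients, the underlying glm object can be tidied with exponentiation; a quick sketch (extract_fit_engine() pulls out the glm fit, and broom's tidy() method for glm accepts these arguments):
# Coefficients as odds ratios with 95% confidence intervals
stock_fit %>%
extract_fit_engine() %>%
tidy(exponentiate = TRUE, conf.int = TRUE)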
predict(stock_fit, test_data)
# A tibble: 252 × 1
.pred_class
<fct>
1 Up
2 Up
3 Up
4 Up
5 Down
6 Up
7 Up
8 Up
9 Up
10 Up
# … with 242 more rows
stock_augment <-
augment(stock_fit, test_data) %>%
select(year, direction, .pred_class, .pred_Down, .pred_Up)
stock_augment
# A tibble: 252 × 5
year direction .pred_class .pred_Down .pred_Up
<dbl> <fct> <fct> <dbl> <dbl>
1 2005 Down Up 0.472 0.528
2 2005 Down Up 0.484 0.516
3 2005 Down Up 0.477 0.523
4 2005 Up Up 0.486 0.514
5 2005 Down Down 0.502 0.498
6 2005 Up Up 0.499 0.501
7 2005 Down Up 0.497 0.503
8 2005 Up Up 0.490 0.510
9 2005 Down Up 0.496 0.504
10 2005 Up Up 0.489 0.511
# … with 242 more rows
stock_augment %>%
roc_curve(truth = direction, .pred_Down) %>%
autoplot()
stock_augment %>%
roc_auc(truth = direction, .pred_Down) # 0.52
# A tibble: 1 × 3
.metric .estimator .estimate
<chr> <chr> <dbl>
1 roc_auc binary 0.520
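Before simplifying the recipe to only two lag predictors, the hard class predictions at the default 0.5 cutoff can also be inspected with a confusion matrix; a quick sketch using yardstick (output not shown):
# Confusion matrix of predicted vs actual direction on the 2005 test set
stock_augment %>%
conf_mat(truth = direction, estimate = .pred_class)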
stock_recipe_two_lag <-
recipe(direction ~ ., data = train_data) %>%
update_role(year, today, lag3, lag4, lag5, volume, new_role = "ID")
summary(stock_recipe_two_lag)
# A tibble: 9 × 4
variable type role source
<chr> <chr> <chr> <chr>
1 year numeric ID original
2 lag1 numeric predictor original
3 lag2 numeric predictor original
4 lag3 numeric ID original
5 lag4 numeric ID original
6 lag5 numeric ID original
7 volume numeric ID original
8 today numeric ID original
9 direction nominal outcome original
# update workflow
stock_workflow_update <-
workflow() %>%
add_model(lr_model) %>%
add_recipe(stock_recipe_two_lag)
stock_workflow_update
══ Workflow ══════════════════════════════════════════════════════════
Preprocessor: Recipe
Model: logistic_reg()
── Preprocessor ──────────────────────────────────────────────────────
0 Recipe Steps
── Model ─────────────────────────────────────────────────────────────
Logistic Regression Model Specification (classification)
Computational engine: glm
# fit data
stock_fit_update <-
stock_workflow_update %>%
fit(data = train_data)
stock_fit_update %>%
extract_fit_parsnip() %>%
tidy()
# A tibble: 3 × 5
term estimate std.error statistic p.value
<chr> <dbl> <dbl> <dbl> <dbl>
1 (Intercept) 0.0322 0.0634 0.508 0.611
2 lag1 -0.0556 0.0517 -1.08 0.282
3 lag2 -0.0445 0.0517 -0.861 0.389
# predict
predict(stock_fit_update, test_data)
# A tibble: 252 × 1
.pred_class
<fct>
1 Up
2 Up
3 Up
4 Up
5 Up
6 Up
7 Up
8 Up
9 Up
10 Up
# … with 242 more rows
# augment
stock_augment_update <-
augment(stock_fit_update, test_data) %>%
select(year, direction, .pred_class, .pred_Down, .pred_Up)
stock_augment_update
# A tibble: 252 × 5
year direction .pred_class .pred_Down .pred_Up
<dbl> <fct> <fct> <dbl> <dbl>
1 2005 Down Up 0.490 0.510
2 2005 Down Up 0.479 0.521
3 2005 Down Up 0.467 0.533
4 2005 Up Up 0.474 0.526
5 2005 Down Up 0.493 0.507
6 2005 Up Up 0.494 0.506
7 2005 Down Up 0.495 0.505
8 2005 Up Up 0.487 0.513
9 2005 Down Up 0.491 0.509
10 2005 Up Up 0.484 0.516
# … with 242 more rows
# ROC curve
stock_augment_update %>%
roc_curve(truth = direction, .pred_Down) %>%
autoplot()
stock_augment_update %>%
roc_auc(truth = direction, .pred_Down) # 0.558.
# A tibble: 1 × 3
.metric .estimator .estimate
<chr> <chr> <dbl>
1 roc_auc binary 0.558
The AUC improves slightly to 0.558. Note that AUC measures how well the predicted probabilities separate Up and Down days across all thresholds; the proportion of daily movements classified correctly is a separate metric (accuracy).
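To check the proportion of daily movements predicted correctly at the 0.5 cutoff, yardstick's accuracy() can be applied to the same augmented test set; a quick sketch (output not shown):
# Share of test-set days where .pred_class matches the actual direction
stock_augment_update %>%
accuracy(truth = direction, estimate = .pred_class)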
lda_model <-
discrim_linear() %>%
set_engine("MASS")
MASS::lda() fits a model that estimates a multivariate distribution for the predictors separately for the data in each class (Gaussian with a common covariance matrix). Bayes’ theorem is used to compute the probability of each class, given the predictor values.
This engine has no tuning parameters, and is used for classification.
The same recipe can be reused.
stock_recipe_two_lag
Recipe
Inputs:
role #variables
ID 6
outcome 1
predictor 2
lda_workflow <-
workflow() %>%
add_model(lda_model) %>%
add_recipe(stock_recipe_two_lag)
lda_fit <-
lda_workflow %>%
fit(data = train_data)
lda_fit %>%
extract_fit_parsnip()
parsnip model object
Fit time: 8ms
Call:
lda(..y ~ ., data = data)
Prior probabilities of groups:
Down Up
0.491984 0.508016
Group means:
lag1 lag2
Down 0.04279022 0.03389409
Up -0.03954635 -0.03132544
Coefficients of linear discriminants:
LD1
lag1 -0.6420190
lag2 -0.5135293
predict(lda_fit, test_data)
# A tibble: 252 × 1
.pred_class
<fct>
1 Up
2 Up
3 Up
4 Up
5 Up
6 Up
7 Up
8 Up
9 Up
10 Up
# … with 242 more rows
lda_augment <-
augment(lda_fit, test_data) %>%
select(year, direction, .pred_class, .pred_Down, .pred_Up)
lda_augment %>%
roc_curve(truth = direction, .pred_Down) %>%
autoplot()
lda_augment %>%
roc_auc(truth = direction, .pred_Down) # 0.558
# A tibble: 1 × 3
.metric .estimator .estimate
<chr> <chr> <dbl>
1 roc_auc binary 0.558
log_reg_auc <-
stock_augment_update %>%
roc_curve(truth = direction, .pred_Down) %>%
mutate(model = "Log Regression")
lda_auc <-
lda_augment %>%
roc_curve(truth = direction, .pred_Down) %>%
mutate(model = "LDA")
bind_rows(log_reg_auc, lda_auc) %>%
ggplot(aes(x = 1-specificity,
y = sensitivity,
col = model)) +
geom_path(lwd = 1.5, alpha = 0.6) +
geom_abline(lty = 3) +
coord_equal() +
labs(title = "Comparing LDA with Log Regression",
subtitle = "The predictions are almost identical!") +
theme_classic() +
theme(legend.position = "top")
discrim_quad() defines a model that estimates a multivariate distribution for the predictors separately for the data in each class (usually Gaussian with separate covariance matrices). Bayes’ theorem is used to compute the probability of each class, given the predictor values.
regularization_method: a character string for the type of regularized estimation. Possible values are “diagonal”, “shrink_cov”, and “shrink_mean” (sparsediscrim engine only).
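That argument applies only to the sparsediscrim engine; for reference, a regularized QDA specification could look like the sketch below (not fitted in this post, and it assumes the sparsediscrim package is installed).
# Sketch only: regularized QDA via the sparsediscrim engine
qda_reg_mod <-
discrim_quad(regularization_method = "shrink_cov") %>%
set_engine("sparsediscrim")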
qda_mod <-
discrim_quad() %>%
set_engine("MASS")
I used the same recipe
stock_recipe_two_lag
Recipe
Inputs:
role #variables
ID 6
outcome 1
predictor 2
qda_workflow <-
workflow() %>%
add_model(qda_mod) %>%
add_recipe(stock_recipe_two_lag)
qda_fit <-
qda_workflow %>%
fit(data = train_data)
qda_fit %>%
extract_fit_parsnip()
parsnip model object
Fit time: 6ms
Call:
qda(..y ~ ., data = data)
Prior probabilities of groups:
Down Up
0.491984 0.508016
Group means:
lag1 lag2
Down 0.04279022 0.03389409
Up -0.03954635 -0.03132544
predict(qda_fit, test_data)
# A tibble: 252 × 1
.pred_class
<fct>
1 Up
2 Up
3 Up
4 Up
5 Up
6 Up
7 Up
8 Up
9 Up
10 Up
# … with 242 more rows
qda_augment <-
augment(qda_fit, test_data) %>%
select(year, direction, .pred_class, .pred_Down, .pred_Up)
qda_augment %>%
roc_auc(truth = direction, .pred_Down) # 0.562
# A tibble: 1 × 3
.metric .estimator .estimate
<chr> <chr> <dbl>
1 roc_auc binary 0.562
qda_auc <-
qda_augment %>%
roc_curve(truth = direction, .pred_Down) %>%
mutate(model = "QDA")
Comparing AUC:
auc_log <- stock_augment_update %>%
roc_auc(truth = direction, .pred_Down) %>%
mutate(model = "Log Reg")
auc_lda <- lda_augment %>%
roc_auc(truth = direction, .pred_Down) %>%
mutate(model = "LDA")
auc_qda <- qda_augment %>%
roc_auc(truth = direction, .pred_Down) %>%
mutate(model = "QDA")
bind_rows(auc_log, auc_lda, auc_qda)
# A tibble: 3 × 4
.metric .estimator .estimate model
<chr> <chr> <dbl> <chr>
1 roc_auc binary 0.558 Log Reg
2 roc_auc binary 0.558 LDA
3 roc_auc binary 0.562 QDA
ROC plot:
bind_rows(log_reg_auc,
lda_auc,
qda_auc) %>%
ggplot(aes(x = 1-specificity,
y = sensitivity,
col = model)) +
geom_path(lwd = 1.5, alpha = 0.8) +
geom_abline(lty = 3) +
coord_equal() +
labs(title = "Comparing QDA, LDA and Log Regression",
subtitle = "QDA is very slightly better than the other two models!") +
scale_color_jco() +
theme_classic() +
theme(legend.position = "top")
[ISLR2 book](https://web.stanford.edu/~hastie/ISLRv2_website.pdf)
https://discrim.tidymodels.org/reference/discrim_linear.html
For attribution, please cite this work as:
lruolin (2021, Oct. 7). pRactice corner: The Stock Market Data (ISLR2). Retrieved from https://lruolin.github.io/myBlog/posts/20211007 ISLR2 Stockmarket Data/
BibTeX citation:
@misc{lruolin2021the,
  author = {lruolin},
  title = {pRactice corner: The Stock Market Data (ISLR2)},
  url = {https://lruolin.github.io/myBlog/posts/20211007 ISLR2 Stockmarket Data/},
  year = {2021}
}